##################################
#### Name: 08_relatio_extract_features.py
### Purpose: This code generates RELATIO narrative features using default settings and 
## joins them back to the original document set. We use starter code provided by
## Germain Gauthier, one of the RELATIO authors, to accomplish this.
## The RELATIO paper write up can be found here: https://www.cambridge.org/core/journals/political-analysis/article/relatio-text-semantics-capture-political-and-economic-narratives/E72C0482A44C9A817E381B394A73E2D6
## The RELATIO package can be found here: https://pypi.org/project/relatio/. 
### Data In:
## 1) Bioweapons case study articles
## /scratch/olympus/projects/russia_ukraine_war/bioweapons_new/bioweapons_casestudy_5_20_2024.csv
### Data Out:
## 1) /scratch/olympus/projects/russia_ukraine_war/bioweapons/concatenated_df.csv

## Notes:
## This code cannot be replicated 
## because it relies on full text versions
## of the articles. We have included the code
## as a reference for replicators and with 
## full file paths for reference for the research
## team. 

## Installation of relatio ##
## upgrade pip, wheel and setuptools
# python -m pip install -U pip wheel setuptools
#
## install the package
# python -m pip install -U relatio

import sys
import os

# Add the miniconda library path for spaCy. Python does not expand
# "$LD_LIBRARY_PATH" inside a string literal, so the current value is read
# explicitly and the miniconda directory is appended to it. Empty entries are
# dropped so an unset variable does not produce a leading ':'.
# NOTE(review): the dynamic loader normally reads LD_LIBRARY_PATH at process
# start-up, so this assignment mainly affects child processes -- confirm it
# has the intended effect for spaCy in this environment.
os.environ['LD_LIBRARY_PATH'] = ':'.join(
    entry
    for entry in (os.environ.get('LD_LIBRARY_PATH', ''), '/ext3/miniconda3/lib')
    if entry
)

# Project directory. Output file names further down are built by prefixing
# `path` directly (path + 'name'), which relies on the trailing slash here.
path = "/scratch/olympus/projects/russia_ukraine_war/bioweapons/"
# Make the project directory the current working directory.
os.chdir(path)

import pandas as pd
from relatio import FileLogger
from relatio import Preprocessor
from relatio import SRL
from relatio import extract_roles
from relatio import load_data
import pickle as pk 

# Catch warnings through relatio's FileLogger (WARNING level).
logger = FileLogger(level='WARNING')

# Load the bioweapons case-study articles.
art = pd.read_csv("/scratch/olympus/projects/russia_ukraine_war/bioweapons_new/bioweapons_casestudy_5_20_2024.csv")

# Rename the article id and translated text columns to 'id' and 'doc'
# (presumably the names relatio's Preprocessor expects -- confirm). The rename
# is applied in place, so `art` itself carries the new names from here on.
art.rename(
    columns={'article_id': 'id', 'content_translated': 'doc'},
    inplace=True,
)

# Only the identifier and the document text go into preprocessing.
df = art[['id', 'doc']]

# Characters handed to the Preprocessor through `remove_chars`.
chars_to_strip = ["\"",'-',"^",".","?","!",";","(",")",",",":","\'","+","&","|","/","{","}",
                  "~","_","`","[","]",">","<","=","*","%","$","@","#","’"]

# Text preprocessor built on the small English spaCy model, with no custom
# stop words and a single worker process.
p = Preprocessor(
    spacy_model="en_core_web_sm",
    remove_punctuation=True,
    remove_digits=True,
    lowercase=True,
    lemmatize=True,
    remove_chars=chars_to_strip,
    stop_words=[],
    n_process=1,
    batch_size=100,
)

# Split each document into sentences; `df` is rebound to the result, whose
# 'sentence' column feeds the steps below.
df = p.split_into_sentences(
    df,
    output_path=f"{path}RELATIO_pre_processor_output.csv",
    progress_bar=True,
)

# with open(path+'df_sentences.pkl','wb') as f:
#     pk.dump(df, f)

# Semantic role labeling: "takes in a plain-text sentence and identifies the action,
# the agent performing that action, and the patient being acted upon."
#
# The labeler instance gets its own name instead of rebinding `SRL`. The
# imported class therefore stays available, and this section can be re-run in
# an interactive session without trying to construct a model from an instance.
srl_model = SRL(
    path = "https://storage.googleapis.com/allennlp-public-models/openie-model.2020.03.26.tar.gz",
    batch_size = 10,
    cuda_device = -1  # presumably selects CPU (AllenNLP convention) -- confirm
)

# Run the labeler over every sentence from the preprocessing step.
srl_res = srl_model(df['sentence'], progress_bar=True)


# Save the SRLs
# with open(path+'srl_res.pkl','wb') as f:
#     pk.dump(srl_res, f)


# Extract roles from the SRL output, restricted to the role labels below.
srl_role_labels = ["ARG0","B-V","B-ARGM-NEG","B-ARGM-MOD","ARG1","ARG2"]
roles, sentence_index = extract_roles(
    srl_res,
    used_roles=srl_role_labels,
    only_triplets=True,
    progress_bar=True,
)

# Preview the first 20 SRL-based roles.
for srl_role in roles[:20]:
    print(srl_role)

# Extract Subject-Verb-Object (patient) tuples.
# NOTE: this rebinds both `roles` and `sentence_index`, so every later step
# works with the SVO extraction rather than the SRL roles previewed above.
sentence_index, roles = p.extract_svos(
    df['sentence'],
    expand_nouns=True,
    only_triplets=False,
    progress_bar=True,
)

# Preview the first 20 SVO-based roles.
for svo in roles[:20]:
    print(svo)

from relatio.utils import load_roles

# Clean up text of raw extracted semantic roles. See also
# https://github.com/relatio-nlp/relatio/blob/d686104671adb4f6bf6340228266a1bb7160d5f2/relatio/preprocessing.py
roles_file = f"{path}postproc_roles.json"
postproc_roles = p.process_roles(
    roles,
    max_length=50,
    progress_bar=True,
    output_path=roles_file,
)

# Preview the first 20 post-processed roles.
for cleaned_role in postproc_roles[:20]:
    print(cleaned_role)

# Reload the roles from the file that was passed as `output_path` above.
postproc_roles = load_roles(roles_file)

from relatio.utils import load_entities

# Directly extract coherent entities using named entity recognition. See also
# https://github.com/relatio-nlp/relatio/blob/d686104671adb4f6bf6340228266a1bb7160d5f2/relatio/preprocessing.py
entities_file = f"{path}entities.pkl"
known_entities = p.mine_entities(
    df['sentence'],
    clean_entities=True,
    progress_bar=True,
    output_path=entities_file,
)

# Show the ten most frequent entities.
for entity_count in known_entities.most_common(10):
    print(entity_count)

# Reload the entities from the file that was passed as `output_path` above.
known_entities = load_entities(entities_file)

# Keep the names of the 100 most frequent entities, skipping the empty string.
top_known_entities = [
    entity
    for entity, _count in known_entities.most_common(100)
    if entity != ''
]

# Semantic clustering on agent-patient phrases containing top-ranked entities.
from relatio.narrative_models import NarrativeModel

# Model settings, collected in one place and unpacked into the constructor.
narrative_model_settings = {
    'clustering': 'kmeans',
    'PCA': True,
    'UMAP': True,
    'roles_considered': ['ARG0', 'B-V', 'B-ARGM-NEG', 'ARG1'],
    'roles_with_known_entities': ['ARG0', 'ARG1'],
    'known_entities': top_known_entities,
    'assignment_to_known_entities': 'embeddings',
    'roles_with_unknown_entities': ['ARG0', 'ARG1'],
    'threshold': 0.1,
}
m = NarrativeModel(**narrative_model_settings)

# Fit the model on the post-processed roles.
m.fit(postproc_roles, progress_bar=True)

# Save model
# with open(path+'narrative_model.pkl','wb') as f:
#     pk.dump(m,f)


# Uncomment to see plots
# m.plot_selection_metric(metric = 'inertia') 
# m.plot_clusters(path = path+'clusters.pdf') 
# m.clusters_to_txt(path = path+'clusters.txt')

from relatio.utils import prettify

# Apply the narrative model fitted above to the roles extracted from the
# sentence data.
narratives = m.predict(postproc_roles, progress_bar=True)

# Save predictions
# with open(path+'narrative_preds.pkl','wb') as f:
#     pk.dump(narratives,f)

# Pass each predicted narrative through relatio's `prettify` helper.
pretty_narratives = [prettify(narrative) for narrative in narratives]

# for i in range(10):           
#     print(roles[i])
#     print(postproc_roles[i])
#     print(pretty_narratives[i])


# Uncomment to see graphs
# from relatio import build_graph, draw_graph

# G = build_graph(
#     narratives, 
#     top_n = 100, 
#     prune_network = True
# )

# draw_graph(
#     G,
#     notebook = True,
#     show_buttons = False,
#     width="1600px",
#     height="1000px",
#     output_filename = path+'network_of_narratives.html'
#     )


# Merge with the original dataframe.

# Re-read the article table (the earlier in-place rename changed its column
# names) and rename only the id column so it can serve as the join key.
art = pd.read_csv(
    "/scratch/olympus/projects/russia_ukraine_war/bioweapons_new/bioweapons_casestudy_5_20_2024.csv"
).rename(columns={'article_id': 'id'})

# One row per extracted role, keyed by the sentence it was extracted from.
role_rows = pd.DataFrame({
    "sentence_index": sentence_index,
    "roles": roles,
    "postproc_roles": postproc_roles,
    "narrative": narratives,
})

# Expose the sentence table's index as a column so it can be joined on.
df["sentence_index"] = df.index

# Sentences -> roles/narratives, then articles -> sentences.
sentence_rows = df.merge(role_rows, on="sentence_index")
concatenated_df = art.merge(sentence_rows, on='id')
# concatenated_df.to_csv('concatenated_df.csv')
